1 Abstract

This script assembles R object from data downloaded from NCBI/Gene Expression Omnibus Database (GSE112679). The R objects will be assembled into an R dat package for easy access. The objects are constructed to parallel the analyses of these data are reported in Cai et al. (2019) [1]).

2 Set Analysis Parameters

 GSE_ID <- 'GSE112679'

 # extData
 ################################
 if(file.exists(file.path('../../extData'))) 
 EXT_DIR <- file.path('../../extData') else stop("Speficy EXT_DIR")

 # GSE_Data
 ###################################
 if(file.exists(file.path(EXT_DIR, GSE_ID)))
 GSE_DATA_DIR <- file.path(EXT_DIR,GSE_ID) else stop("Speficy GSE_DATA_DIR")

 # SampleDesc
 ###################################
 if(file.exists(file.path(GSE_DATA_DIR, paste0(GSE_ID,'_series_matrix.txt'))))
 SAMPLE_DESC_FILE <- file.path(GSE_DATA_DIR, paste0(GSE_ID,'_series_matrix.txt')) else 
 stop("Speficy SAMPLE_DESC_FILE")
# Define utility function
readSeqData.f <- function(seqFile, Verbose=F){
 seqFile.frm <- fread(seqFile)
 names(seqFile.frm) <- c('GeneId','Chr', 'Start', 'End', 'Strand', 'Length', 'Count')

 # Reformat for ordering
 seqFile.frm$Chr <- 
  as.character(factor(seqFile.frm$Chr,
               levels=paste0('chr', c(1:22,'X','Y','M')),
               labels=paste0('chr', c(formatC(1:22,width=2,flag='0'),'X','Y','M'))))

 rownames(seqFile.frm) <-
 with(seqFile.frm, 
       paste(Chr, Strand, Start, End, GeneId, sep='~'))

 seqFile.frm <- seqFile.frm[,-(1:6),drop=F]
 if(Verbose) print(seqFile.frm[1:6,,drop=F])
 if(Verbose) print(dim(seqFile.frm))

 seqFile.frm}

3 Load analysis dataset

3.1 Get Sample Description Data

# kelly's colors - https://i.kinja-img.com/gawker-media/image/upload/1015680494325093012.JPG
 # https://gist.github.com/ollieglass/f6ddd781eeae1d24e391265432297538
 #KellyColors.vec <-  see web site
 # REMOVED '#F2F3F4' in first entry
 col_vec <-  c('#222222', '#F3C300', '#875692', '#F38400', '#A1CAF1', 
              '#BE0032', '#C2B280', '#848482', '#008856', '#E68FAC', '#0067A5',
              '#F99379', '#604E97', '#F6A600', '#B3446C', '#DCD300', '#882D17',
              '#8DB600', '#654522', '#E25822', '#2B3D26')

 suppressPackageStartupMessages(require(GEOquery))

 GSEMatrix_obj <- getGEO(GSE_ID, GSEMatrix=T)
## Found 1 file(s)
## GSE112679_series_matrix.txt.gz
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
## File stored at:
## /var/folders/wd/v5lfllh152x0hwhtr773cnmnjqcpdt/T//RtmpPWAJZr/GPL18573.soft
 show(GSEMatrix_obj)
## $GSE112679_series_matrix.txt.gz
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 0 features, 2606 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: GSM3076193 GSM3076194 ... GSM3905974 (2606 total)
##   varLabels: title geo_accession ... training/validation group:ch1 (46
##     total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
##   pubMedIds: 31358576 
## Annotation: GPL18573
 DT::datatable(pData(phenoData(GSEMatrix_obj[[1]])))
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
 KEY_FIELDS <- grep(':ch1$', names(pData(phenoData(GSEMatrix_obj[[1]]))), value=T)

 sampType_vec <- sapply(
 strsplit(as.character(pData(phenoData(GSEMatrix_obj[[1]]))[,"title"]), 
 split='_'), function(X) X[1])

 sampNo_vec <- sapply(
 strsplit(as.character(pData(phenoData(GSEMatrix_obj[[1]]))[,"title"]), 
 split='_'), function(X) X[2])

 bioSamples_vec <- sapply(
 strsplit(as.character(pData(phenoData(GSEMatrix_obj[[1]]))[,"relation"]), 
 split='/'), function(X) rev(X)[1])

 SRA_vec <- sapply(
 strsplit(as.character(pData(phenoData(GSEMatrix_obj[[1]]))[,"relation.1"]), 
 split='='), function(X) rev(X)[1])

 fileName_vec <- sapply(
 strsplit(as.character(pData(phenoData(GSEMatrix_obj[[1]]))[,"supplementary_file_1"]), 
 split='/'), function(X) rev(X)[1])

 sampDesc_frm <- data.frame(
   geoAcc=pData(phenoData(GSEMatrix_obj[[1]]))[,"geo_accession"],
   title=pData(phenoData(GSEMatrix_obj[[1]]))[,"title"],
   sampType=sampType_vec,
   sampNo=sampNo_vec,
   bioSample=bioSamples_vec, 
   SRA=SRA_vec,
   fileName=fileName_vec,
   pData(phenoData(GSEMatrix_obj[[1]]))[, KEY_FIELDS])

 names(sampDesc_frm) <- 
 sub('bclc.stage.ch1', 'stage',
 sub('diagnosis.ch1', 'Dx',
 sub('tissue.subtype.ch1', 'tissueSubtype',
 sub('tissue.ch1', 'tissue',
 sub('training.validation.group.ch1', 'trainValGroup',
 names(sampDesc_frm))))))

 # Make sure fileName and geoAcc match
 fileNamegeoAcc.vec <- sapply(
 strsplit(as.character(sampDesc_frm$fileName), split='_'), '[',1)
 if(sum(fileNamegeoAcc.vec!= sampDesc_frm$geoAcc))
 stop("geoAcc/fileName Mismatch")

 str(sampDesc_frm)
## 'data.frame':    2606 obs. of  12 variables:
##  $ geoAcc       : chr  "GSM3076193" "GSM3076194" "GSM3076195" "GSM3076196" ...
##  $ title        : chr  "blood_1" "blood_2" "blood_3" "blood_4" ...
##  $ sampType     : chr  "blood" "blood" "blood" "blood" ...
##  $ sampNo       : chr  "1" "2" "3" "4" ...
##  $ bioSample    : chr  "SAMN08869478" "SAMN08869477" "SAMN08869476" "SAMN08869475" ...
##  $ SRA          : chr  "SRX3890002" "SRX3890003" "SRX3890004" "SRX3890005" ...
##  $ fileName     : chr  "GSM3076193_Seq_1.genebody.txt.gz" "GSM3076194_Seq_2.genebody.txt.gz" "GSM3076195_Seq_3.genebody.txt.gz" "GSM3076196_Seq_4.genebody.txt.gz" ...
##  $ stage        : chr  NA NA NA NA ...
##  $ Dx           : chr  "CHB" "Liver cirrhosis" "Liver cirrhosis" "Healthy" ...
##  $ tissueSubtype: chr  NA NA NA NA ...
##  $ tissue       : chr  "plasma" "plasma" "plasma" "plasma" ...
##  $ trainValGroup: chr  "Training" "Training" "Training" "Training" ...
 cat("geoAcc is unique - use as rownames:\n")
## geoAcc is unique - use as rownames:
 with(sampDesc_frm, table(table(geoAcc, exclude=NULL)))
## 
##    1 
## 2606
 rownames(sampDesc_frm) <- sampDesc_frm$geoAcc
 
 cat("SRA is unique:\n")
## SRA is unique:
 with(sampDesc_frm, table(table(SRA, exclude=NULL)))
## 
##    1 
## 2606
 cat("bioSample is unique:\n")
## bioSample is unique:
 with(sampDesc_frm, table(table(bioSample, exclude=NULL)))
## 
##    1 
## 2606
 cat("title is unique:\n")
## title is unique:
 with(sampDesc_frm, table(table(title, exclude=NULL)))
## 
##    1 
## 2606
 cat("Some Samples Match bu sampNo:")
## Some Samples Match bu sampNo:
 with(sampDesc_frm, table(table(sampNo, exclude=NULL)))
## 
##    1    2    3 
## 2531    3   23
 # NOTE: examination of the data indicate that sampNo cannot be used
 # to match Blood with TU or TI samples

DEPRICATED <- function() {
 NoSamp.tbl <- with(sampDesc_frm, table(sampNo, exclude=NULL))

 sampDesc_frm <- merge(
  data.frame(sampNo=names(nSamp.tbl), nSamp=as.vector(nSamp.tbl)),
  sampDesc_frm, by='sampNo', all.y=T)
}#DEPRICATED

 sampDesc_frm <- sampDesc_frm[with(sampDesc_frm,
  order(as.numeric(sampNo), title)),]


 # Shorten trainValGroup
 sampDesc_frm$trainValGroup <- 
 sub('Training', 'Train',
 sub('Validation', 'Val',
 sampDesc_frm$trainValGroup))

 # Shorten Dx
 sampDesc_frm$Dx <- 
 sub('Benign liver lesions', 'Benign',
 sub('Liver cirrhosis', 'Cirrhosis',
 sampDesc_frm$Dx))


 trainValGroupDX.tbl <-  
 with(sampDesc_frm, 
 table(Dx_tissue=paste0(Dx,'_',tissue), trainValGroup, exclude=NULL))

 trainValGroupDX.tbl
##                   trainValGroup
## Dx_tissue          Train Val-1 Val-2 <NA>
##   Benign_plasma      253   132     3    0
##   CHB_plasma         190    96     0    0
##   Cirrhosis_plasma    73    33     0    0
##   HCC_liver            0     0     0   52
##   HCC_plasma         335   809    60    0
##   Healthy_plasma     269   124   177    0
 barplot(trainValGroupDX.tbl, beside=T, legend=T, 
    col=col_vec[1:nrow(trainValGroupDX.tbl)])
 title("Sample Counts by Tissue Type")

 # For consistency with previous code, we will use Outcome as an alias to Dx,
 # and sampID as an alias to geoAcc
 sampDesc_frm$outcome <- sampDesc_frm$Dx
 sampDesc_frm$sampID <- sampDesc_frm$geoAcc

 cat("Cai et al. combine Beign+Healthy and Cirrhosis+HCC\n")
## Cai et al. combine Beign+Healthy and Cirrhosis+HCC
 cat("Create secondary outcome\n")
## Create secondary outcome
 sampDesc_frm$outcome2 <- with(sampDesc_frm, 
 ifelse(Dx %in% c("Benign", "Healthy"), 'BenignHealthy',
 ifelse(Dx %in% c("Cirrhosis", "CHB"), 'CirrhosisCHB', Dx)))

 with(sampDesc_frm, table(trainValGroup, outcome2, exclude=NULL))
##              outcome2
## trainValGroup BenignHealthy CirrhosisCHB HCC
##         Train           522          263 335
##         Val-1           256          129 809
##         Val-2           180            0  60
##         <NA>              0            0  52
 cat("Also want outcome3 == HCC or nonHCC\n")
## Also want outcome3 == HCC or nonHCC
 sampDesc_frm$outcome3 <- with(sampDesc_frm,
 ifelse(Dx == 'HCC', 'HCC', 'nonHCC'))

 with(sampDesc_frm, table(trainValGroup, outcome3, exclude=NULL))
##              outcome3
## trainValGroup HCC nonHCC
##         Train 335    785
##         Val-1 809    385
##         Val-2  60    180
##         <NA>   52      0
 usethis::use_data(sampDesc_frm, overwrite=T)
## ✔ Setting active project to '/Users/fcollin/Documents/Projects/Rstuff/devPackages/GSE112679'
## ✔ Saving 'sampDesc_frm' to 'data/sampDesc_frm.rda'
## ● Document your data (see 'https://r-pkgs.org/data.html')

3.2 Sample Description Data Table

 DT::datatable(sampDesc_frm,  options=list(pageLength = 18))
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

STOP HERE

1. Cai, J., Chen, L., Zhang, Z., Zhang, X., Lu, X., Liu, W., Shi, G., Ge, Y., Gao, P., and Yang, Y. et al. Genome-wide mapping of 5-hydroxymethylcytosines in circulating cell-free dna as a non-invasive approach for early detection of hepatocellular carcinoma. Gut, gutjnl–2019–318882. Available at: http://gut.bmj.com/content/early/2019/07/28/gutjnl-2019-318882.abstract.